{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "### How to Extract Legal Citations using Python (for the complete beginner)\n",
    "### Law and Courts Newsletter\n",
    "### Rachael K. Hinkle\n",
    "### June 6, 2022\n",
    "### Run on macOS 10.15.7\n",
    "\n",
    "### The only difference in this code for Windows is the way filepaths are set up. They use a different file seperator.\n",
    "## Macs use a single foward slash; if you are using a mac run the follwing code:\n",
    "fileSeperator = \"/\"\n",
    "## Windows uses a double back slash: if you are using Windows use the following code,\n",
    "# Note, to do this delete the \"#\" at the beginning of the next line.\n",
    "#fileSeperator = \"\\\\\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/javascript": [
       "IPython.notebook.set_autosave_interval(0)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Autosave disabled\n"
     ]
    }
   ],
   "source": [
    "# The following code turns off the autosave feature to avoid inadvertently saving over a previous file.\n",
    "# As always, remember to save your work as you go along.\n",
    "%autosave 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3.6.13 |Anaconda, Inc.| (default, Feb 23 2021, 12:58:59) \n",
      "[GCC Clang 10.0.0 ]\n"
     ]
    }
   ],
   "source": [
    "## Preliminary Step 1: It helps to verify we are running Python 3.6 before proceeding\n",
    "\n",
    "# Python tip: Any text preceeded by a \"#\" is ignored by Python.\n",
    "\n",
    "import sys\n",
    "print(sys.version)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Preliminary Step 2: Install the lexNLP package (only need to do this the first time)\n",
    "# Note: This step will take a bit of time.  When an asterisk appears in the square brackets\n",
    "# to the left of a cell (e.g. \"In [*]\"), that means the code is running.  A number will\n",
    "# appear in the square brackets when the code is done running."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: lexnlp in /opt/anaconda3/envs/Python3.6Test/lib/python3.6/site-packages (1.8.0)\n",
      "Requirement already satisfied: num2words==0.5.10 in /opt/anaconda3/envs/Python3.6Test/lib/python3.6/site-packages (from lexnlp) (0.5.10)\n",
      "Requirement already satisfied: datefinder-lexpredict==0.6.2.1 in /opt/anaconda3/envs/Python3.6Test/lib/python3.6/site-packages (from lexnlp) (0.6.2.1)\n",
      "Requirement already satisfied: dateparser==0.7.2 in /opt/anaconda3/envs/Python3.6Test/lib/python3.6/site-packages (from lexnlp) (0.7.2)\n",
      "Requirement already satisfied: regex==2020.7.14 in /opt/anaconda3/envs/Python3.6Test/lib/python3.6/site-packages (from lexnlp) (2020.7.14)\n",
      "Requirement already satisfied: requests==2.24.0 in /opt/anaconda3/envs/Python3.6Test/lib/python3.6/site-packages (from lexnlp) (2.24.0)\n",
      "Collecting numpy==1.19.1\n",
      "  Using cached numpy-1.19.1-cp36-cp36m-macosx_10_9_x86_64.whl (15.3 MB)\n",
      "Requirement already satisfied: pycountry==20.7.3 in /opt/anaconda3/envs/Python3.6Test/lib/python3.6/site-packages (from lexnlp) (20.7.3)\n",
      "Requirement already satisfied: Unidecode==1.1.1 in /opt/anaconda3/envs/Python3.6Test/lib/python3.6/site-packages (from lexnlp) (1.1.1)\n",
      "Requirement already satisfied: nltk==3.5 in /opt/anaconda3/envs/Python3.6Test/lib/python3.6/site-packages (from lexnlp) (3.5)\n",
      "Requirement already satisfied: reporters-db==2.0.3 in /opt/anaconda3/envs/Python3.6Test/lib/python3.6/site-packages (from lexnlp) (2.0.3)\n",
      "Requirement already satisfied: gensim==3.8.3 in /opt/anaconda3/envs/Python3.6Test/lib/python3.6/site-packages (from lexnlp) (3.8.3)\n",
      "Collecting scipy==1.5.1\n",
      "  Using cached scipy-1.5.1-cp36-cp36m-macosx_10_9_x86_64.whl (28.8 MB)\n",
      "Requirement already satisfied: joblib==0.14.0 in /opt/anaconda3/envs/Python3.6Test/lib/python3.6/site-packages (from lexnlp) (0.14.0)\n",
      "Requirement already satisfied: us==2.0.2 in /opt/anaconda3/envs/Python3.6Test/lib/python3.6/site-packages (from lexnlp) (2.0.2)\n",
      "Requirement already satisfied: scikit-learn==0.23.1 in /opt/anaconda3/envs/Python3.6Test/lib/python3.6/site-packages (from lexnlp) (0.23.1)\n",
      "Collecting pandas==0.24.2\n",
      "  Using cached pandas-0.24.2-cp36-cp36m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl (16.3 MB)\n",
      "Requirement already satisfied: pytz in /opt/anaconda3/envs/Python3.6Test/lib/python3.6/site-packages (from datefinder-lexpredict==0.6.2.1->lexnlp) (2021.3)\n",
      "Requirement already satisfied: python-dateutil>=2.4.2 in /opt/anaconda3/envs/Python3.6Test/lib/python3.6/site-packages (from datefinder-lexpredict==0.6.2.1->lexnlp) (2.8.2)\n",
      "Requirement already satisfied: tzlocal in /opt/anaconda3/envs/Python3.6Test/lib/python3.6/site-packages (from dateparser==0.7.2->lexnlp) (4.1)\n",
      "Requirement already satisfied: smart-open>=1.8.1 in /opt/anaconda3/envs/Python3.6Test/lib/python3.6/site-packages (from gensim==3.8.3->lexnlp) (5.2.1)\n",
      "Requirement already satisfied: six>=1.5.0 in /opt/anaconda3/envs/Python3.6Test/lib/python3.6/site-packages (from gensim==3.8.3->lexnlp) (1.16.0)\n",
      "Requirement already satisfied: tqdm in /opt/anaconda3/envs/Python3.6Test/lib/python3.6/site-packages (from nltk==3.5->lexnlp) (4.62.3)\n",
      "Requirement already satisfied: click in /opt/anaconda3/envs/Python3.6Test/lib/python3.6/site-packages (from nltk==3.5->lexnlp) (8.0.3)\n",
      "Requirement already satisfied: docopt>=0.6.2 in /opt/anaconda3/envs/Python3.6Test/lib/python3.6/site-packages (from num2words==0.5.10->lexnlp) (0.6.2)\n",
      "Requirement already satisfied: idna<3,>=2.5 in /opt/anaconda3/envs/Python3.6Test/lib/python3.6/site-packages (from requests==2.24.0->lexnlp) (2.10)\n",
      "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /opt/anaconda3/envs/Python3.6Test/lib/python3.6/site-packages (from requests==2.24.0->lexnlp) (1.25.11)\n",
      "Requirement already satisfied: chardet<4,>=3.0.2 in /opt/anaconda3/envs/Python3.6Test/lib/python3.6/site-packages (from requests==2.24.0->lexnlp) (3.0.4)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /opt/anaconda3/envs/Python3.6Test/lib/python3.6/site-packages (from requests==2.24.0->lexnlp) (2021.5.30)\n",
      "Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/anaconda3/envs/Python3.6Test/lib/python3.6/site-packages (from scikit-learn==0.23.1->lexnlp) (3.1.0)\n",
      "Requirement already satisfied: jellyfish==0.6.1 in /opt/anaconda3/envs/Python3.6Test/lib/python3.6/site-packages (from us==2.0.2->lexnlp) (0.6.1)\n",
      "Requirement already satisfied: importlib-metadata in /opt/anaconda3/envs/Python3.6Test/lib/python3.6/site-packages (from click->nltk==3.5->lexnlp) (4.8.3)\n",
      "Requirement already satisfied: typing-extensions>=3.6.4 in /opt/anaconda3/envs/Python3.6Test/lib/python3.6/site-packages (from importlib-metadata->click->nltk==3.5->lexnlp) (4.1.1)\n",
      "Requirement already satisfied: zipp>=0.5 in /opt/anaconda3/envs/Python3.6Test/lib/python3.6/site-packages (from importlib-metadata->click->nltk==3.5->lexnlp) (3.6.0)\n",
      "Requirement already satisfied: pytz-deprecation-shim in /opt/anaconda3/envs/Python3.6Test/lib/python3.6/site-packages (from tzlocal->dateparser==0.7.2->lexnlp) (0.1.0.post0)\n",
      "Requirement already satisfied: backports.zoneinfo in /opt/anaconda3/envs/Python3.6Test/lib/python3.6/site-packages (from tzlocal->dateparser==0.7.2->lexnlp) (0.2.1)\n",
      "Requirement already satisfied: importlib-resources in /opt/anaconda3/envs/Python3.6Test/lib/python3.6/site-packages (from backports.zoneinfo->tzlocal->dateparser==0.7.2->lexnlp) (5.4.0)\n",
      "Requirement already satisfied: tzdata in /opt/anaconda3/envs/Python3.6Test/lib/python3.6/site-packages (from pytz-deprecation-shim->tzlocal->dateparser==0.7.2->lexnlp) (2021.5)\n",
      "Installing collected packages: numpy, scipy, pandas\n",
      "  Attempting uninstall: numpy\n",
      "    Found existing installation: numpy 1.19.2\n",
      "    Uninstalling numpy-1.19.2:\n",
      "      Successfully uninstalled numpy-1.19.2\n",
      "  Attempting uninstall: scipy\n",
      "    Found existing installation: scipy 1.5.2\n",
      "    Uninstalling scipy-1.5.2:\n",
      "      Successfully uninstalled scipy-1.5.2\n",
      "  Attempting uninstall: pandas\n",
      "    Found existing installation: pandas 1.1.5\n",
      "    Uninstalling pandas-1.1.5:\n",
      "      Successfully uninstalled pandas-1.1.5\n",
      "Successfully installed numpy-1.19.1 pandas-0.24.2 scipy-1.5.1\n",
      "Note: you may need to restart the kernel to use updated packages.\n"
     ]
    }
   ],
   "source": [
    "pip install lexnlp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Preliminary Step 3: Import the libraries you will need for this session (analogous to R packages)\n",
    "\n",
    "# 'os' provides useful funtions for dealing with the operating system\n",
    "# 're' provides the ability to use regular expressions\n",
    "# 'csv' provides the ability to write the results to a .csv file\n",
    "# 'string' provides useful functions for processing the opinion content\n",
    "# 'lexnlp' provides the ability to extract citations from natural language\n",
    "\n",
    "import os, re, csv, string, lexnlp\n",
    "import lexnlp.extract.en.citations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'/Users/rachaelhinkle/Dropbox/Law_and_Courts_Newsletter_2022/Hinkle_L&C_Newsletter_Dataverse_Files2'"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## Preliminary Step 4: See the name of your working directory\n",
    "os.getcwd()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Preliminary Step 5: Setup a shortcut reference to your working directory\n",
    "\n",
    "# Python tip: This just creates a variable with a string. \n",
    "#             You can name the variable anything you want.\n",
    "\n",
    "mydir = os.getcwd() + fileSeperator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "### Illustration 1: How to extract citations from a single file\n",
    "## Note: If you are familiar with Python, you can proceed to Illustration 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['\\n', '\\n', '             UNITED STATES OF AMERICA, Plaintiff-Appellee, v. KORVELL\\n', '                       DENNIS PITTMAN, Defendant-Appellant.\\n', '\\n', '                                   No. 04-2546\\n', '\\n', '              UNITED STATES COURT OF APPEALS FOR THE SEVENTH CIRCUIT\\n', '\\n', '                     411 F.3d 813; 2005 U.S. App. LEXIS 11067\\n', '\\n', '                              April 5, 2005, Argued\\n', '                              June 13, 2005, Decided\\n', '\\n', '\\n', 'COUNSEL: For UNITED STATES OF AMERICA, Plaintiff-Appellee: John K. Mehochko,\\n', 'OFFICE OF THE UNITED STATES ATTORNEY, Rock Island, IL USA.\\n', '\\n', 'For KORVELL D. PITTMAN, Defendant-Appellant: George F. Taseff, OFFICE OF THE\\n', 'FEDERAL PUBLIC DEFENDER, Peoria, IL USA.\\n', '\\n', 'OPINION BY: POSNER\\n', '\\n', 'OPINION\\n', '\\n', '     [*814]  POSNER, Circuit Judge. The defendant pleaded guilty to being a felon\\n', 'in possession of a firearm and was sentenced under the federal sentencing\\n', \"guidelines (before the Supreme Court's Booker decision) to 188 months in prison,\\n\", 'the bottom of the applicable guideline range but only eight months above the\\n', 'statutory minimum. [HN1] The Armed Career Criminal Act, 18 U.S.C.  924(e),\\n', 'imposes a 180-month minimum sentence on anyone  [*815]  who has at least three\\n', \"prior convictions of specified offenses. The judge found that the defendant's\\n\", 'criminal record qualified under this provision, and [HN2] findings of prior\\n', \"convictions are not within the scope of Booker's rule. United States v. Booker,\\n\", '160 L. Ed. 2d 621, 125 S. Ct. 738, 756 (2005); United States v. Ngo, 406 F.3d\\n', '839, 2005 WL 1023034, at *2-4 (7th Cir. 2005)  [**2]  .\\n', '\\n']\n"
     ]
    }
   ],
   "source": [
    "## Step 1.1: Read in an opinion (or other legal document) in a .txt file\n",
    "# This code creates a list; each entry of the list is a string that contains the text \n",
    "# from one line in the file\n",
    "\n",
    "# Python tip: The \"print()\" function is very useful for seeing the contents of variable\n",
    "#             to see or double check what the code is doing\n",
    "\n",
    "# Note: If you get an error, check that the \"mydir\" variable correctly lists your directory\n",
    "# where the file is located, and/or make sure you have downloaded the \"myExample.txt\" file from Dataverse.\n",
    "\n",
    "tfile = open(mydir + \"myExample.txt\", \"r\", encoding=\"latin-1\")\n",
    "lines = tfile.readlines()\n",
    "print(lines)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "OPINION [*814] POSNER, Circuit Judge. The defendant pleaded guilty to being a felon in possession of a firearm and was sentenced under the federal sentencing guidelines (before the Supreme Court's Booker decision) to 188 months in prison, the bottom of the applicable guideline range but only eight months above the statutory minimum. [HN1] The Armed Career Criminal Act, 18 U.S.C. 924(e), imposes a 180-month minimum sentence on anyone [*815] who has at least three prior convictions of specified offenses. The judge found that the defendant's criminal record qualified under this provision, and [HN2] findings of prior convictions are not within the scope of Booker's rule. United States v. Booker, 160 L. Ed. 2d 621, 125 S. Ct. 738, 756 (2005); United States v. Ngo, 406 F.3d 839, 2005 WL 1023034, at *2-4 (7th Cir. 2005) [**2] . \n"
     ]
    }
   ],
   "source": [
    "## Step 1.2: Process the list from the previous step to create a single string variable with \n",
    "#           only the majority opinion content\n",
    "\n",
    "#create an empty string variable (that we will later fill with the majority opinion)\n",
    "opinText = \"\" \n",
    "\n",
    "#create a logical operator variable (think of this as a switch set to \"off\")\n",
    "opin = False \n",
    "\n",
    "# loop through each line in the document in turn, after each line it goes to the next,\n",
    "# when there are no longer any lines left, the loop will stop\n",
    "for line in lines: \n",
    "    # Check to see if the line starts with some or no white space followed by \"OPINION\" and then the end of the line\n",
    "    # If so, turn the switch \"on\" (by making the logical operator variable equal True)\n",
    "    # Once this switch is turned on, it will remain that way until we turn it off\n",
    "    if re.search(\"^[\\s]*OPINION$\", line): \n",
    "        opin = True\n",
    "    \n",
    "    # Check for various strings that would indicate the end of the majority opinion\n",
    "    # If any of these patters of text are found, turn the switch \"off\" by making the logical operator variable equal False\n",
    "    if line and (re.search(\"^[\\s]*DISSENT\", line)):\n",
    "        opin = False\n",
    "    if line and (re.search(\"^[\\s]*CONCUR\", line)):\n",
    "        opin = False\n",
    "    if line and (re.search(\"^[^A-Za-z]*APPENDI\", line)):\n",
    "        opin = False\n",
    "    \n",
    "    # For any line of text that occurs while the switch is \"on\", that is not empty, add it\n",
    "    # to the string variable where we are collecting the majority opinion text.\n",
    "    if opin and not re.search(\"^[\\s]*$\", line):\n",
    "        opinText = opinText + line\n",
    "        \n",
    "# Now that the loop is complete, and we have the entire opinion, we can process it\n",
    "# This code finds any instances of one of more spaces and substitutes a single space\n",
    "opinText = re.sub(\"\\s+\", \" \", opinText)\n",
    "\n",
    "#Finally, we want to see our final product\n",
    "print(opinText)\n",
    "\n",
    "# Python tip: some elements of regular epxressions used in the code above:\n",
    "# ^: beginning of the line\n",
    "# $: end of the line\n",
    "# [\\s]*: zero or more white spaces\n",
    "# [\\s]+: one or more white spaces\n",
    "# [^A-Za-z]: Anything that is NOT a capital or lower case letter.  \n",
    "#  (Inside the square brackets \"^\" means NOT, outside the brackets it means beginning of the line)\n",
    "# [A-Z] means any capital letter\n",
    "# [a-z] means any lower case letter\n",
    "# [A-z] means any letter\n",
    "# [0-9] means any numeral\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(160, 'L. Ed. 2d', \"Lawyer's Edition\", 621, '125', None, None)\n",
      "(406, 'F.3d', 'Federal Reporter', 839, '2005', None, None)\n"
     ]
    }
   ],
   "source": [
    "## Step 1.3: Extract citations from the opinion\n",
    "\n",
    "#Extract each ciation and put it in a variable\n",
    "cite_holder = lexnlp.extract.en.citations.get_citations(opinText) \n",
    "\n",
    "# See the citations extracted by using a loop\n",
    "# Note: It doesn't work to just print the variable, but you can print (or otherwise access)\n",
    "#       each item in it by using a loop.\n",
    "for cite in cite_holder:\n",
    "    print(cite)\n",
    "    \n",
    "# From the output we can see that each citation has multiple pieces of information\n",
    "# Illustration 2 will show how to get key information into a spreadsheet for further analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "### Illustration 2: How to extract citations from all files in a subfolder\n",
    "###                 and output the data to a spreadsheet with one row per citation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['filename', 'volume', 'reporter', 'page1', 'page2', 'court', 'year', 'raw citation']\n"
     ]
    }
   ],
   "source": [
    "## Step 2.1: Create and open a .csv file that we will write information into\n",
    "fout = open(mydir + \"myNewSpreadsheet.csv\", \"w\", newline=\"\")\n",
    "outfilehandle = csv.writer(fout,\n",
    "                           delimiter=\",\",\n",
    "                           quotechar='\"',\n",
    "                           quoting=csv.QUOTE_NONNUMERIC)\n",
    "\n",
    "# Create a row that contains the variables names and write that to the first row of the .csv file\n",
    "localrow = []\n",
    "localrow.append(\"filename\")\n",
    "localrow.append(\"volume\")\n",
    "localrow.append(\"reporter\")\n",
    "localrow.append(\"page1\")\n",
    "localrow.append(\"page2\")\n",
    "localrow.append(\"court\")\n",
    "localrow.append(\"year\")\n",
    "localrow.append(\"raw citation\")\n",
    "#localrow.append(\"numCites\")\n",
    "#localrow.append(\"numIds\")\n",
    "outfilehandle.writerow(localrow)\n",
    "\n",
    "print(localrow)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['13638.842.F.2d.509.txt', '14374.967.F.2d.73.txt', '16894.436.F.3d.174.txt', '2761.831.F.2d.538.txt', '7434.544.F.3d.449.txt']\n"
     ]
    }
   ],
   "source": [
    "## Step 2.2: Prepare information for where to read text files from\n",
    "\n",
    "#create a filepath to the folder where all text files are located\n",
    "dirname = mydir + \"sampleOpinions\" + fileSeperator\n",
    "# Create a list of all files in a given folder\n",
    "dirlist = os.listdir(dirname)\n",
    "# See the list of all files to make sure things are working\n",
    "print(dirlist)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "13638.842.F.2d.509.txt\n",
      "14374.967.F.2d.73.txt\n",
      "16894.436.F.3d.174.txt\n",
      "2761.831.F.2d.538.txt\n",
      "7434.544.F.3d.449.txt\n"
     ]
    }
   ],
   "source": [
    "## Step 2.3: Loop through all documents, process text, extract ciations, process citations,\n",
    "##           and export to spreadsheet\n",
    "\n",
    "for entry in dirlist: ## each entry is a separate txt file with a single opinion \n",
    "    infilehandle = open(dirname + entry, encoding='latin-1')\n",
    "    txtlines = infilehandle.readlines()\n",
    "    \n",
    "    # This is useful for watching the progress of the code, espcially with many documents\n",
    "    print(entry)\n",
    "    \n",
    "    # Create empty variables (each time through the loop it is important to make sure\n",
    "    # all variables are empty so values from the previous document do not carry over)\n",
    "    opin_string = \"\"\n",
    "    #opin_string2 = \"\"\n",
    "    #opin_string3 = \"\"\n",
    "    #opin_string4 = \"\"\n",
    "    sent_holder = []\n",
    "    cite_holder = []\n",
    "    localrow = []\n",
    "    sct = 0\n",
    "    op_line = False\n",
    "\n",
    "    # Loop through each line in the document\n",
    "    for txtline in txtlines:\n",
    "        \n",
    "        # Find the beginning and end of the opinion\n",
    "        if re.search(\"^[^A-Za-z]*OPINION[\\s]*$\", txtline):\n",
    "            op_line = True       \n",
    "        if line and (re.search(\"^[\\s]*DISSENT\", line)):\n",
    "            opin = False\n",
    "        if line and (re.search(\"^[\\s]*CONCUR\", line)):\n",
    "            opin = False\n",
    "        if line and (re.search(\"^[^A-Za-z]*APPENDI\", line)):\n",
    "            opin = False\n",
    "       \n",
    "        # Create a string with the majority opinion and process the text\n",
    "        if op_line:\n",
    "            opin_string = opin_string + txtline\n",
    "\n",
    "    ## After looping through all lines in the document, do the following steps once per document   \n",
    "    \n",
    "    ## Process text in opinion string\n",
    "    \n",
    "    # Get rid of newline characters\n",
    "    opin_string = re.sub(\"\\n\", \" \", opin_string)\n",
    "    # Replace short citations with the placeholder \"999 U.S. 999\" that will be recognized by lexnlp\n",
    "    opin_string = re.sub(\"[\\s](I|i)d\\.[\\s]\", \" 999 U.S. 999 \", opin_string)\n",
    "    opin_string = re.sub(\"[\\s](I|i)bid\\.[\\s]\", \" 999 U.S. 999 \", opin_string)\n",
    "            \n",
    "    # Extract citations\n",
    "    cite_holder = lexnlp.extract.en.citations.get_citations(opin_string)\n",
    "    \n",
    "    # Change \"cite_holder\" to a list object \n",
    "    cite_holder = list(cite_holder)\n",
    "    \n",
    "    # Create a second variable that replaces short references with the full citation\n",
    "    \n",
    "    #Pyton tip: Python indexes begin with zero.  So to extract the first element of the\n",
    "    #           variable \"cite_holder\", we can use \"cite_holder[0]\"\n",
    "    \n",
    "    cite_holder2 = []\n",
    "    # Since we need to refer back to the previous element of the loop for this task,\n",
    "    # we use a different kind of loop that uses an index, \"i\"\n",
    "    for i in range(0, len(cite_holder)):\n",
    "        if cite_holder[i][0] == 999: #and cite[1] == \"U\\.S\\.\" and cite[3] == 999:\n",
    "            cite_holder2.append(cite_holder2[i-1])\n",
    "        else:\n",
    "            cite_holder2.append(cite_holder[i])\n",
    "    \n",
    "    # Create empty variables that will be used for exporting information to spreadsheet\n",
    "    vol = \"\"\n",
    "    rep = \"\"\n",
    "    page1 = \"\"\n",
    "    page2 = \"\"\n",
    "    court = \"\"\n",
    "    year = \"\"\n",
    "    \n",
    "    # Put content in each variable that will be written to the spreadsheet\n",
    "    for cite in cite_holder2:\n",
    "        vol =str(cite[0])\n",
    "        rep = str(cite[1])\n",
    "        page1 = str(cite[3])\n",
    "        page2 = str(cite[4])\n",
    "        court = str(cite[5])\n",
    "        year = str(cite[6])\n",
    "        \n",
    "        #The lexnlp function does not perform perfectly, here is some code to clean up\n",
    "        # some of the issues by getting rid of info that is not correct\n",
    "        # Note: the \"raw citation\" column still has the full information for each cite\n",
    "        if len(court) > 20:\n",
    "            court = \"\"\n",
    "        if court == \"None\":\n",
    "            court = \"\"\n",
    "        if page2 == \"None\":\n",
    "            page2 = \"\"\n",
    "        if year == \"None\":\n",
    "            year = \"\"\n",
    "        if re.search(\"U\\.S\\.\", rep):\n",
    "            court = \"SCOTUS\"\n",
    "        if re.search(\"L\\.[\\s]*Ed\\.\", rep):\n",
    "            court = \"SCOTUS\"\n",
    "        if re.search(\"S\\.[\\s]*Ct\\.\", rep):\n",
    "            court = \"SCOTUS\"\n",
    "        \n",
    "        # Write a row to the spreadsheet for each ciation\n",
    "        if vol != \"\":\n",
    "            localrow = []\n",
    "            localrow.append(entry)\n",
    "            localrow.append(vol)\n",
    "            localrow.append(rep)\n",
    "            localrow.append(page1)\n",
    "            localrow.append(page2)\n",
    "            localrow.append(court)\n",
    "            localrow.append(year)\n",
    "            localrow.append(cite)\n",
    "            outfilehandle.writerow(localrow)\n",
    "\n",
    "# Close the files you are reading from and to\n",
    "infilehandle.close()\n",
    "fout.close()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
